R Package: [tidyverse]

A collection of packages for data manipulation

packages
Author

Tony Duan

Published

July 11, 2023

The tidyverse is an opinionated collection of R packages designed for data science. All packages share an underlying design philosophy, grammar, and data structures.

Code
library(tidyverse)
library(gapminder)
Code
# Bind the gapminder dataset to a working name and preview its first rows.
data001 <- gapminder
head(data001)
# A tibble: 6 × 6
  country     continent  year lifeExp      pop gdpPercap
  <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
1 Afghanistan Asia       1952    28.8  8425333      779.
2 Afghanistan Asia       1957    30.3  9240934      821.
3 Afghanistan Asia       1962    32.0 10267083      853.
4 Afghanistan Asia       1967    34.0 11537966      836.
5 Afghanistan Asia       1972    36.1 13079460      740.
6 Afghanistan Asia       1977    38.4 14880372      786.

1 filter data with filter()

Code
# Keep only the rows where country is China and year is 1997.
data002 <- data001 %>%
  filter(country == "China", year == 1997)

data002
# A tibble: 1 × 6
  country continent  year lifeExp        pop gdpPercap
  <fct>   <fct>     <int>   <dbl>      <int>     <dbl>
1 China   Asia       1997    70.4 1230075000     2289.

2 select variable with select()

select one variable

Code
# Keep only the `country` column, then preview the first rows.
data002 <- data001 %>%
  select(country)
data002 %>% head()
# A tibble: 6 × 1
  country    
  <fct>      
1 Afghanistan
2 Afghanistan
3 Afghanistan
4 Afghanistan
5 Afghanistan
6 Afghanistan

exclude one variable (select everything except it)

Code
# Drop the `country` column (a leading minus excludes it), then preview.
data002 <- data001 %>%
  select(-country)
data002 %>% head()
# A tibble: 6 × 5
  continent  year lifeExp      pop gdpPercap
  <fct>     <int>   <dbl>    <int>     <dbl>
1 Asia       1952    28.8  8425333      779.
2 Asia       1957    30.3  9240934      821.
3 Asia       1962    32.0 10267083      853.
4 Asia       1967    34.0 11537966      836.
5 Asia       1972    36.1 13079460      740.
6 Asia       1977    38.4 14880372      786.

3 create new variable with mutate()

mutate

Code
# Add `pop_k` (pop divided by 1000) alongside the existing columns.
data002 <- data001 %>%
  mutate(pop_k = pop / 1000)

data002 %>% head()
# A tibble: 6 × 7
  country     continent  year lifeExp      pop gdpPercap  pop_k
  <fct>       <fct>     <int>   <dbl>    <int>     <dbl>  <dbl>
1 Afghanistan Asia       1952    28.8  8425333      779.  8425.
2 Afghanistan Asia       1957    30.3  9240934      821.  9241.
3 Afghanistan Asia       1962    32.0 10267083      853. 10267.
4 Afghanistan Asia       1967    34.0 11537966      836. 11538.
5 Afghanistan Asia       1972    36.1 13079460      740. 13079.
6 Afghanistan Asia       1977    38.4 14880372      786. 14880.

4 create a new variable and keep only that variable with transmute()

transmute

Code
# Compute `pop_k` (pop divided by 1000) and keep only that new column.
data002 <- data001 %>%
  transmute(pop_k = pop / 1000)

data002 %>% head()
# A tibble: 6 × 1
   pop_k
   <dbl>
1  8425.
2  9241.
3 10267.
4 11538.
5 13079.
6 14880.

5 summaries with group_by() and summarise()

group by

Code
# Summarise the 1997 rows per continent: one output row per group.
data002 <- data001 %>%
  filter(year == 1997) %>%
  group_by(continent) %>%
  summarise(
    total_pop = sum(pop),  # sum of pop within the group
    count = n(),           # number of rows in the group
    avg_pop = mean(pop),   # mean of pop within the group
    sd_pop = sd(pop)       # standard deviation of pop within the group
  )

data002 %>% head()
# A tibble: 5 × 5
  continent  total_pop count    avg_pop     sd_pop
  <fct>          <dbl> <int>      <dbl>      <dbl>
1 Africa     743832984    52  14304480.  19873013.
2 Americas   796900410    25  31876016.  62032823.
3 Asia      3383285500    33 102523803. 262349716.
4 Europe     568944148    30  18964805.  22748145.
5 Oceania     22241430     2  11120715   10528152.

6 arrange data with arrange()

order from smallest to largest (ascending)

Code
# Total 1997 population per continent, sorted by total_pop in ascending order.
data002 <- data001 %>%
  filter(year == 1997) %>%
  group_by(continent) %>%
  summarise(total_pop = sum(pop)) %>%
  arrange(total_pop)
data002 %>% head()
# A tibble: 5 × 2
  continent  total_pop
  <fct>          <dbl>
1 Oceania     22241430
2 Europe     568944148
3 Africa     743832984
4 Americas   796900410
5 Asia      3383285500

order from largest to smallest (descending)

Code
# Total 1997 population per continent, sorted by total_pop in descending order.
data002 <- data001 %>%
  filter(year == 1997) %>%
  group_by(continent) %>%
  summarise(total_pop = sum(pop)) %>%
  arrange(desc(total_pop))
data002 %>% head()
# A tibble: 5 × 2
  continent  total_pop
  <fct>          <dbl>
1 Asia      3383285500
2 Americas   796900410
3 Africa     743832984
4 Europe     568944148
5 Oceania     22241430

7 Reference

R for Data Science (book): https://r4ds.had.co.nz/